Load all the packages we need

library(readxl)
library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0     ✔ purrr   0.2.5
## ✔ tibble  1.4.2     ✔ dplyr   0.7.8
## ✔ tidyr   0.8.2     ✔ stringr 1.3.1
## ✔ readr   1.2.1     ✔ forcats 0.3.0
## ── Conflicts ───────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(RColorBrewer)
library(ggplot2)
library(ggthemes)
library(dplyr)
library(choroplethr)
## Loading required package: acs
## Loading required package: XML
## 
## Attaching package: 'acs'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:base':
## 
##     apply
library(choroplethrMaps)
library(ggbeeswarm)
library(tidyverse)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(ggbeeswarm)

load all the data we are going to use here and do summary of each dataset.

Summary statistics such as mean,median,mode and quartiles for each variable.

COR_Adult_CCcontact <- read_excel("COR_Adult_CCcontact.xlsx")
summary(COR_Adult_CCcontact) 
##     Caseload                    ActualValue   
##  Min.   :2011-10-11 00:00:00   Min.   :15255  
##  1st Qu.:2013-08-13 00:00:00   1st Qu.:15614  
##  Median :2015-02-15 00:00:00   Median :16727  
##  Mean   :2015-01-26 15:03:55   Mean   :16683  
##  3rd Qu.:2016-07-16 00:00:00   3rd Qu.:17531  
##  Max.   :2017-12-17 00:00:00   Max.   :18518  
##     Forecast                   ForecastValue      Variance      
##  Min.   :2011-11-11 00:00:00   Min.   :15056   Min.   :-1276.0  
##  1st Qu.:2013-03-13 00:00:00   1st Qu.:15433   1st Qu.:    0.0  
##  Median :2014-02-14 00:00:00   Median :15812   Median :   64.0  
##  Mean   :2014-03-05 08:11:42   Mean   :16251   Mean   :  432.3  
##  3rd Qu.:2015-02-15 00:00:00   3rd Qu.:17066   3rd Qu.: 1064.0  
##  Max.   :2017-11-17 00:00:00   Max.   :18608   Max.   : 2506.0  
##   VariancePerc      
##  Min.   :-0.077188  
##  1st Qu.: 0.000000  
##  Median : 0.003643  
##  Mean   : 0.027801  
##  3rd Qu.: 0.066631  
##  Max.   : 0.160744
MS_ADO_AEM <- read_excel("MS_ADO_AEM.xlsx")
summary(MS_ADO_AEM) 
##     Caseload                    ActualValue    
##  Min.   :2015-10-01 00:00:00   Min.   : 892.0  
##  1st Qu.:2016-12-01 00:00:00   1st Qu.: 963.8  
##  Median :2017-07-01 00:00:00   Median :1019.5  
##  Mean   :2017-05-28 20:00:00   Mean   :1006.3  
##  3rd Qu.:2018-01-01 00:00:00   3rd Qu.:1046.0  
##  Max.   :2018-05-01 00:00:00   Max.   :1089.0  
##     Forecast                   ForecastValue       Variance      
##  Min.   :2016-02-01 00:00:00   Min.   : 902.4   Min.   :-92.000  
##  1st Qu.:2016-06-01 00:00:00   1st Qu.: 945.1   1st Qu.: -2.814  
##  Median :2016-11-01 00:00:00   Median : 994.5   Median : 26.503  
##  Mean   :2016-12-02 15:37:08   Mean   : 982.4   Mean   : 23.844  
##  3rd Qu.:2017-06-01 00:00:00   3rd Qu.:1014.8   3rd Qu.: 53.417  
##  Max.   :2018-06-01 00:00:00   Max.   :1057.0   Max.   :124.980  
##   VariancePerc      
##  Min.   :-0.092555  
##  1st Qu.:-0.002876  
##  Median : 0.026227  
##  Mean   : 0.024724  
##  3rd Qu.: 0.054438  
##  Max.   : 0.129644
MS_ADO_MCS_Alien <- read_excel("MS_ADO_MCS_Alien.xlsx")
summary(MS_ADO_MCS_Alien) 
##     Caseload                    ActualValue  
##  Min.   :2015-10-01 00:00:00   Min.   :1856  
##  1st Qu.:2016-12-01 00:00:00   1st Qu.:2023  
##  Median :2017-07-01 00:00:00   Median :2103  
##  Mean   :2017-05-28 20:00:00   Mean   :2084  
##  3rd Qu.:2018-01-01 00:00:00   3rd Qu.:2174  
##  Max.   :2018-05-01 00:00:00   Max.   :2183  
##     Forecast                   ForecastValue     Variance      
##  Min.   :2016-02-01 00:00:00   Min.   :1857   Min.   :-51.000  
##  1st Qu.:2016-06-01 00:00:00   1st Qu.:1935   1st Qu.:  4.948  
##  Median :2016-11-01 00:00:00   Median :2003   Median : 53.095  
##  Mean   :2016-12-02 15:37:08   Mean   :2021   Mean   : 63.305  
##  3rd Qu.:2017-06-01 00:00:00   3rd Qu.:2080   3rd Qu.:102.163  
##  Max.   :2018-06-01 00:00:00   Max.   :2216   Max.   :250.875  
##   VariancePerc      
##  Min.   :-0.026087  
##  1st Qu.: 0.002354  
##  Median : 0.025596  
##  Mean   : 0.031847  
##  3rd Qu.: 0.050414  
##  Max.   : 0.129844
MS_ADO_Total <- read_excel("MS_ADO_Total.xlsx")
summary(MS_ADO_Total) 
##     Caseload                    ActualValue    
##  Min.   :2007-08-01 00:00:00   Min.   :221062  
##  1st Qu.:2011-04-01 00:00:00   1st Qu.:253642  
##  Median :2013-08-01 00:00:00   Median :256924  
##  Mean   :2013-07-12 10:41:34   Mean   :257778  
##  3rd Qu.:2015-10-01 00:00:00   3rd Qu.:265125  
##  Max.   :2018-05-01 00:00:00   Max.   :280098  
##     Forecast                   ForecastValue       Variance       
##  Min.   :2007-11-01 00:00:00   Min.   :221093   Min.   :-65521.7  
##  1st Qu.:2010-06-01 00:00:00   1st Qu.:253559   1st Qu.:-12177.7  
##  Median :2012-11-01 00:00:00   Median :263891   Median : -1514.9  
##  Mean   :2012-09-20 04:34:30   Mean   :264817   Mean   : -7038.9  
##  3rd Qu.:2015-02-01 00:00:00   3rd Qu.:273677   3rd Qu.:   648.8  
##  Max.   :2018-06-01 00:00:00   Max.   :320117   Max.   : 11822.0  
##   VariancePerc      
##  Min.   :-0.204681  
##  1st Qu.:-0.043609  
##  Median :-0.005863  
##  Mean   :-0.023138  
##  3rd Qu.: 0.002553  
##  Max.   : 0.048505
MS_ADO_QMB <- read_excel("MS_ADO_QMB.xlsx")
summary(MS_ADO_QMB) 
##     Caseload                    ActualValue   
##  Min.   :2007-08-01 00:00:00   Min.   :14443  
##  1st Qu.:2007-12-01 00:00:00   1st Qu.:15106  
##  Median :2008-02-15 12:00:00   Median :15600  
##  Mean   :2008-02-04 15:36:00   Mean   :15402  
##  3rd Qu.:2008-05-01 00:00:00   3rd Qu.:15670  
##  Max.   :2008-06-01 00:00:00   Max.   :15713  
##     Forecast                   ForecastValue      Variance     
##  Min.   :2007-11-01 00:00:00   Min.   :14200   Min.   :  53.0  
##  1st Qu.:2007-11-01 00:00:00   1st Qu.:14404   1st Qu.: 399.8  
##  Median :2007-11-01 00:00:00   Median :14656   Median : 764.0  
##  Mean   :2007-12-24 12:00:00   Mean   :14709   Mean   : 693.7  
##  3rd Qu.:2008-02-01 00:00:00   3rd Qu.:14827   3rd Qu.: 972.6  
##  Max.   :2008-06-01 00:00:00   Max.   :15546   Max.   :1178.1  
##   VariancePerc     
##  Min.   :0.003683  
##  1st Qu.:0.026488  
##  Median :0.052760  
##  Mean   :0.047481  
##  3rd Qu.:0.066396  
##  Max.   :0.081054
ESA_TANF_Child_Only <- read_excel("ESA_TANF_Child_Only.xlsx")
summary(ESA_TANF_Child_Only)
##     Caseload                    ActualValue   
##  Min.   :2010-07-01 00:00:00   Min.   :11847  
##  1st Qu.:2013-01-01 00:00:00   1st Qu.:13865  
##  Median :2014-07-01 00:00:00   Median :15024  
##  Mean   :2014-08-04 14:56:53   Mean   :15976  
##  3rd Qu.:2016-05-01 00:00:00   3rd Qu.:17043  
##  Max.   :2018-07-01 00:00:00   Max.   :25621  
##     Forecast                   ForecastValue      Variance      
##  Min.   :2011-06-01 00:00:00   Min.   :11671   Min.   :-4882.0  
##  1st Qu.:2012-11-01 00:00:00   1st Qu.:13862   1st Qu.: -717.0  
##  Median :2014-02-01 00:00:00   Median :15801   Median :  -70.0  
##  Mean   :2014-03-02 17:03:06   Mean   :16477   Mean   : -501.1  
##  3rd Qu.:2015-06-01 00:00:00   3rd Qu.:17649   3rd Qu.:    7.0  
##  Max.   :2018-06-01 00:00:00   Max.   :25620   Max.   : 1173.0  
##   VariancePerc       
##  Min.   :-0.2299684  
##  1st Qu.:-0.0453534  
##  Median :-0.0050769  
##  Mean   :-0.0283730  
##  3rd Qu.: 0.0004377  
##  Max.   : 0.0735534
#To not see any number like "1e+05" we will penalize the scientific notation version by 999

options(scipen=999)

let’s have a first look at the data using ggplot2.

We have used Caseload as a constant element in all graphics for the x axis and we have used ActualValue, ForecastValue, and Variance for the y axis for each dataset we did load here

ESA_TANF_Child_Only  %>% ggplot(aes(x=Caseload,y=ActualValue)) + geom_line() + geom_hline(yintercept = 0,color="purple")

Use span to control the “wiggliness” of the default loess smoother. The span is the fraction of points used to fit each local regression. small numbers make a wigglier curve, larger numbers make a smoother curve.

We added a loess smoothing curve. The span of .05 is the result of trial and error to give us the degree of smoothing we wanted.

ESA_TANF_Child_Only  %>% ggplot(aes(x=Caseload,y=ForecastValue)) + geom_line() + geom_hline(yintercept = 0,color="purple")+
  geom_smooth(method = "loess",span=.05)

ESA_TANF_Child_Only  %>% ggplot(aes(x=Caseload,y=Variance)) + geom_line() + geom_hline(yintercept = 0,color="purple")+
  geom_smooth(method = "loess",span=.05)

We did use different dataset here. We made the graph interactive using plotly.

spa1 =COR_Adult_CCcontact %>% ggplot(aes(x=Caseload,y=ActualValue)) + geom_line() + geom_hline(yintercept = 0,color="purple")
ggplotly(spa1)
spa2 = COR_Adult_CCcontact %>% ggplot(aes(x=Caseload,y=ForecastValue)) + geom_line() + geom_hline(yintercept = 0,color="purple")+
  geom_smooth(method = "loess",span=.07)
ggplotly(spa2)
spa3 = COR_Adult_CCcontact %>% ggplot(aes(x=Caseload,y=Variance)) + geom_line() + geom_hline(yintercept = 0,color="purple")+
  geom_smooth(method = "loess",span=.07)
ggplotly(spa3)
spb1 =MS_ADO_AEM %>% ggplot(aes(x=Caseload,y=ActualValue)) + geom_line() + geom_hline(yintercept = 0,color="purple")
ggplotly(spb1)
spb2 =MS_ADO_AEM %>% ggplot(aes(x=Caseload,y=ForecastValue)) + geom_line() + geom_hline(yintercept = 0,color="purple")+
  geom_smooth(method = "loess",span=.2)
ggplotly(spb2)
spb3 =MS_ADO_AEM %>% ggplot(aes(x=Caseload,y=Variance)) + geom_line() + geom_hline(yintercept = 0,color="purple")+
  geom_smooth(method = "loess",span=.2)
ggplotly(spb3)
spc1=MS_ADO_MCS_Alien %>% ggplot(aes(x=Caseload,y=ActualValue)) + geom_line() + geom_hline(yintercept = 0,color="purple")
ggplotly(spc1)
spc2=MS_ADO_MCS_Alien %>% ggplot(aes(x=Caseload,y=ForecastValue)) + geom_line() + geom_hline(yintercept = 0,color="purple")+
  geom_smooth(method = "loess",span=.2)
ggplotly(spc2)
spc3=MS_ADO_MCS_Alien %>% ggplot(aes(x=Caseload,y=Variance)) + geom_line() + geom_hline(yintercept = 0,color="purple")+
  geom_smooth(method = "loess",span=.2)
ggplotly(spc3)
spd1=MS_ADO_Total  %>% ggplot(aes(x=Caseload,y=ActualValue)) + geom_line() + geom_hline(yintercept = 0,color="purple")
ggplotly(spd1)
spd2=MS_ADO_Total  %>% ggplot(aes(x=Caseload,y=ForecastValue)) + geom_line() + geom_hline(yintercept = 0,color="purple")+
  geom_smooth(method = "loess",span=.05)
ggplotly(spd2)
spd3=MS_ADO_Total  %>% ggplot(aes(x=Caseload,y=Variance)) + geom_line() + geom_hline(yintercept = 0,color="purple")+
  geom_smooth(method = "loess",span=.05)
ggplotly(spd3)
spe1=MS_ADO_QMB  %>% ggplot(aes(x=Caseload,y=ActualValue)) + geom_line() + geom_hline(yintercept = 0,color="purple")
ggplotly(spe1)
spe2=MS_ADO_QMB  %>% ggplot(aes(x=Caseload,y=ForecastValue)) + geom_line() + geom_hline(yintercept = 0,color="purple")+
  geom_smooth(method = "loess",span=.2)
ggplotly(spe2)
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 1.2124e+09
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 2.8102e+06
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 6.278e+12
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : pseudoinverse used
## at 1.2124e+09
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : neighborhood radius
## 2.8102e+06
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : reciprocal
## condition number 0
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : There are other
## near singularities as well. 6.278e+12
spe3=MS_ADO_QMB  %>% ggplot(aes(x=Caseload,y=Variance)) + geom_line() + geom_hline(yintercept = 0,color="purple")+
  geom_smooth(method = "loess",span=.2)
ggplotly(spe3)
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 1.2124e+09
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 2.8102e+06
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 6.278e+12
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : pseudoinverse used
## at 1.2124e+09
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : neighborhood radius
## 2.8102e+06
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : reciprocal
## condition number 0
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : There are other
## near singularities as well. 6.278e+12